This data set, “Weekly_US_Crude_Oil.csv”, contains the weekly U.S. field production of crude oil in Thousand Barrels per day. The data is from the first week of 1983 to the second week of February 2021. The code below cleans this data to create time series line plots.
We begin by loading the data into R work space and calling the various supportive libraries for cleaning
# Loading the data into R
produnclean <- read.csv("C:\\Users\\User\\Desktop\\attachment_2 (4).csv", header= F, strip.white=T, skip =3)
#Inspecting the Loaded Data
head(produnclean)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1983-Jan 01/07 8,634 01/14 8,634 01/21 8,634 01/28 8,634
## 2 1983-Feb 02/04 8,660 02/11 8,660 02/18 8,660 02/25 8,660
## 3 1983-Mar 03/04 8,677 03/11 8,677 03/18 8,677 03/25 8,677
## 4 1983-Apr 04/01 8,677 04/08 8,686 04/15 8,686
## 5 1983-May 05/13 8,682 05/20 8,682
## 6 1983-Jun 06/03 8,676 06/10 8,676 06/17 8,676 06/24 8,676
## V10 V11
## 1
## 2
## 3
## 4 04/29 8,686
## 5
## 6
str(produnclean)
## 'data.frame': 458 obs. of 11 variables:
## $ V1 : chr "1983-Jan" "1983-Feb" "1983-Mar" "1983-Apr" ...
## $ V2 : chr "01/07 " "02/04 " "03/04 " "04/01 " ...
## $ V3 : chr "8,634 " "8,660 " "8,677 " "8,677 " ...
## $ V4 : chr "01/14 " "02/11 " "03/11 " "04/08 " ...
## $ V5 : chr "8,634 " "8,660 " "8,677 " "8,686 " ...
## $ V6 : chr "01/21 " "02/18 " "03/18 " "04/15 " ...
## $ V7 : chr "8,634 " "8,660 " "8,677 " "8,686 " ...
## $ V8 : chr "01/28 " "02/25 " "03/25 " "" ...
## $ V9 : chr "8,634 " "8,660 " "8,677 " "" ...
## $ V10: chr "" "" "" "04/29 " ...
## $ V11: chr "" "" "" "8,686 " ...
names(produnclean)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
#Beginning the Cleaning Process by eliminating unnecessary rows and columns
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.1
library(stringr)
library (tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v forcats 0.5.1
## v readr 2.0.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library (lubridate)
## Warning: package 'lubridate' was built under R version 4.1.1
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
attach(produnclean)
Next, we split the month column and generate complete dates to be later converted into POSIX
#Generating Week One's full dates
produnclean <- produnclean %>% separate(V1, into =c('Year', 'Month'), sep='-') %>% subset(select=-Month)
Week1Date<- paste(produnclean$Year, "/", produnclean$V2)
produnclean <- produnclean %>% mutate(Week1Date)%>% subset(select=-V2)
#Generating Week two's full dates
Week2Date<- paste(produnclean$Year, "/", produnclean$V4)
produnclean <- produnclean %>% mutate(Week2Date)%>% subset(select=-V4)
#Generating Week three's full dates
Week3Date<- paste(produnclean$Year, "/", produnclean$V6)
produnclean <- produnclean %>% mutate(Week3Date)%>% subset(select=-V6)
#Generating week four's full dates
Week4Date<- paste(produnclean$Year, "/", produnclean$V8)
produnclean <- produnclean %>% mutate(Week4Date)%>% subset(select=-V8)
#Generating week five's full dates
Week5Date<- paste(produnclean$Year, "/", produnclean$V10)
produnclean <- produnclean %>% mutate(Week5Date)%>% subset(select=-V10)%>% subset(select=-Year)
We then assign the separated dates their correct production values by first joining them into a single string
#Matching week dates to their production values
produnclean <- produnclean %>% unite('Allweek1prod',Week1Date,V3, sep='_') %>% unite('Allweek2prod',Week2Date,V5, sep='_') %>% unite('Allweek3prod',Week3Date,V7, sep='_') %>% unite('Allweek4prod',Week4Date,V9, sep='_') %>% unite('Allweek5prod',Week5Date,V11, sep='_')
We then split the newly formed data frame of all five week’s production across the years into different data frames for narrowed down organization to generate one final data frame containing the two primary columns: Date and Production
#Sub-setting the various production weeks into different data frames for easier analysis
##For week 1
produncleanWk1 <- produnclean %>% select(Allweek1prod)
produncleanWk1 <- produncleanWk1 %>% separate(Allweek1prod, into =c('Date', 'Production'), sep ='_')
##For Week 2
produncleanWk2 <- produnclean %>% select(Allweek2prod)
produncleanWk2 <- produncleanWk2 %>% separate(Allweek2prod, into =c('Date', 'Production'), sep ='_')
##For Week 3
produncleanWk3 <- produnclean %>% select(Allweek3prod)
produncleanWk3 <- produncleanWk3 %>% separate(Allweek3prod, into =c('Date', 'Production'), sep ='_')
##For Week 4
produncleanWk4 <- produnclean %>% select(Allweek4prod)
produncleanWk4 <- produncleanWk4 %>% separate(Allweek4prod, into =c('Date', 'Production'), sep ='_')
##For Week 5
produncleanWk5 <- produnclean %>% select(Allweek5prod)
produncleanWk5 <- produncleanWk5 %>% separate(Allweek5prod, into =c('Date', 'Production'), sep ='_')
#Merging the 5 weeks' data frames into one
cleanprodlist <- list(produncleanWk1, produncleanWk2, produncleanWk3,produncleanWk4,produncleanWk5)
cleanprod <- cleanprodlist %>% reduce(full_join, by='Date')
cleanprod <- cleanprod %>% subset(select= c(Date,Production.x))
cleanprod <- cleanprod %>% rename(Production=Production.x)
In this step, we remove missing and invalid values. Here, we also restructure the data to make the Date and Production string values uniform for later conversion into the required analyzable formats. We then view the data to verify these structural changes.
#Removing missing values
cleanprod <- cleanprod %>% na.exclude(cleanprod)
sum(is.na(cleanprod))
## [1] 0
#Removing the dates that lack production (resulting from week 5 data)
cleanprod <- cleanprod %>% subset(Date!="1983 / " & Date!="1984 / " & Date!="1985 / " & Date!="1986 / " & Date!="1987 / " & Date!="1988 / " & Date!="1989 / " & Date!="1990 / " & Date!="1991 / " & Date!="1992 / " & Date!="1993 / " & Date!="1994 / " & Date!="1995 / " & Date!="1996 / " & Date!="1997 / " & Date!="1998 / " & Date!="1999 / " & Date!="2000 / " & Date!="2001 / " & Date!="2002 / " & Date!="2003 / " & Date!="2004 / " & Date!="2005 / " & Date!="2006 / " & Date!="2007 / " & Date!="2008 / " & Date!="2009 / " & Date!="2010 / " & Date!="2011 / " & Date!="2012 / " & Date!="2013 / " & Date!="2014 / " & Date!="2015 / " & Date!="2016 / " & Date!="2017 / " & Date!="2018 / " & Date!="2019 / " & Date!="2020 / " & Date!="2021 / ")
#Unifying date format
cleanprod <- cleanprod %>% mutate (Date = gsub ('\\ /', "/", Date))
cleanprod <- cleanprod %>% mutate (Date = gsub (" ", "", Date))
cleanprod <- cleanprod %>% mutate (Production = gsub (",", "", Production))
Converting the Date and Column values into POSIXct and numeric formats respectively to enable time series analysis and graphics.
cleanprod <- cleanprod %>% mutate (Date = as.POSIXct(Date, format = "%Y/%m/%d"))
cleanprod <-cleanprod %>% mutate (Production = as.numeric(Production))
Generating the interactive time series trendline for the cleaned weekly data
You can also embed plots, for example:
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout